#dependencies and setup
import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
from openpyxl import Workbook
import numpy as np
import requests
import json
import re
from pprint import pprint
import time
# SQLite dependencies
import sqlite3
from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String, Float
from pandas_profiling import ProfileReport
# SQLite DB creation and establishing connection
DB_PATH = "NJ_County_DB.sqlite"
engine = create_engine(f"sqlite:///{DB_PATH}", echo=True)  # echo=True logs every SQL statement
sqlite_connection = engine.connect()

# Catalog query: list every table registered in this SQLite database.
tbls = pd.read_sql(
    """SELECT name FROM sqlite_master
WHERE type='table';""",
    sqlite_connection,
)
tbls
2023-03-27 13:59:15,441 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT name FROM sqlite_master
WHERE type='table';")
2023-03-27 13:59:15,445 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,447 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT name FROM sqlite_master
WHERE type='table';")
2023-03-27 13:59:15,448 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,450 INFO sqlalchemy.engine.Engine SELECT name FROM sqlite_master
WHERE type='table';
2023-03-27 13:59:15,452 INFO sqlalchemy.engine.Engine [raw sql] ()
| name | |
|---|---|
| 0 | nj_mortgage_rates |
| 1 | nj_population |
| 2 | nj_zillow_house_value_index |
| 3 | nj_food_desert |
| 4 | nj_crime_detail |
| 5 | nj_school_performance |
| 6 | nj_zillow_observed_rent_index |
| 7 | nj_adi |
| 8 | nj_counties_dist_to_major_cities |
| 9 | nj_poverty_median_income |
| 10 | nj_property_tax |
We have 11 datasets.
# Profile each table: row count plus, where a `year` column exists,
# the span of years covered (min_year / max_year).
row_count = []
max_year = []
min_year = []
for table_name in tbls['name'].tolist():
    # Table names come from sqlite_master above, so the f-string
    # interpolation is safe here (not user-supplied input).
    try:
        sql_query = f"""SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM {table_name};"""
        df = pd.read_sql(sql_query, sqlite_connection)
        row_count.append(df['count'].values[0])
        min_year.append(df['min_year'].values[0])
        max_year.append(df['max_year'].values[0])
    except Exception:
        # `except Exception` (not bare except) so Ctrl-C etc. still propagate.
        # The query fails when the table has no `year` column -- fall back
        # to the plain row count and record a real NaN (not the string 'NaN').
        sql_query = f"""SELECT count(*) as count FROM {table_name};"""
        df = pd.read_sql(sql_query, sqlite_connection)
        row_count.append(df['count'].values[0])
        min_year.append(np.nan)
        max_year.append(np.nan)
tbls['row_count'] = row_count
tbls['min_year'] = min_year
tbls['max_year'] = max_year
tbls
2023-03-27 13:59:15,501 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_mortgage_rates;")
2023-03-27 13:59:15,504 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,508 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_mortgage_rates;")
2023-03-27 13:59:15,510 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,514 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_mortgage_rates;
2023-03-27 13:59:15,515 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,523 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_population;")
2023-03-27 13:59:15,524 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,527 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_population;")
2023-03-27 13:59:15,530 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,533 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_population;
2023-03-27 13:59:15,535 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,543 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_zillow_house_value_index;")
2023-03-27 13:59:15,545 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,547 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_zillow_house_value_index;")
2023-03-27 13:59:15,551 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,554 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_zillow_house_value_index;
2023-03-27 13:59:15,556 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,563 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_food_desert;")
2023-03-27 13:59:15,565 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,568 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_food_desert;")
2023-03-27 13:59:15,569 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,572 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_food_desert;
2023-03-27 13:59:15,573 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,575 INFO sqlalchemy.engine.Engine ROLLBACK
2023-03-27 13:59:15,576 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count FROM nj_food_desert;")
2023-03-27 13:59:15,578 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,581 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count FROM nj_food_desert;")
2023-03-27 13:59:15,582 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,584 INFO sqlalchemy.engine.Engine SELECT count(*) as count FROM nj_food_desert;
2023-03-27 13:59:15,586 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,593 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_crime_detail;")
2023-03-27 13:59:15,595 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,597 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_crime_detail;")
2023-03-27 13:59:15,598 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,601 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_crime_detail;
2023-03-27 13:59:15,602 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,614 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_school_performance;")
2023-03-27 13:59:15,616 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,618 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_school_performance;")
2023-03-27 13:59:15,620 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,623 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_school_performance;
2023-03-27 13:59:15,625 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,635 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_zillow_observed_rent_index;")
2023-03-27 13:59:15,637 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,639 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_zillow_observed_rent_index;")
2023-03-27 13:59:15,640 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,642 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_zillow_observed_rent_index;
2023-03-27 13:59:15,643 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,648 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_adi;")
2023-03-27 13:59:15,650 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,654 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_adi;")
2023-03-27 13:59:15,656 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,658 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_adi;
2023-03-27 13:59:15,659 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,661 INFO sqlalchemy.engine.Engine ROLLBACK
2023-03-27 13:59:15,663 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count FROM nj_adi;")
2023-03-27 13:59:15,664 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,669 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count FROM nj_adi;")
2023-03-27 13:59:15,672 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,674 INFO sqlalchemy.engine.Engine SELECT count(*) as count FROM nj_adi;
2023-03-27 13:59:15,678 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,683 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_counties_dist_to_major_cities;")
2023-03-27 13:59:15,687 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,689 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_counties_dist_to_major_cities;")
2023-03-27 13:59:15,691 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,693 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_counties_dist_to_major_cities;
2023-03-27 13:59:15,695 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,697 INFO sqlalchemy.engine.Engine ROLLBACK
2023-03-27 13:59:15,699 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count FROM nj_counties_dist_to_major_cities;")
2023-03-27 13:59:15,701 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,708 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count FROM nj_counties_dist_to_major_cities;")
2023-03-27 13:59:15,710 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,712 INFO sqlalchemy.engine.Engine SELECT count(*) as count FROM nj_counties_dist_to_major_cities;
2023-03-27 13:59:15,714 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,720 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_poverty_median_income;")
2023-03-27 13:59:15,722 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,724 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_poverty_median_income;")
2023-03-27 13:59:15,726 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,730 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_poverty_median_income;
2023-03-27 13:59:15,731 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,737 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_property_tax;")
2023-03-27 13:59:15,739 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,742 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_property_tax;")
2023-03-27 13:59:15,744 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,747 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_property_tax;
2023-03-27 13:59:15,748 INFO sqlalchemy.engine.Engine [raw sql] ()
| name | row_count | min_year | max_year | |
|---|---|---|---|---|
| 0 | nj_mortgage_rates | 32 | 1991 | 2022 |
| 1 | nj_population | 252 | 2010 | 2021 |
| 2 | nj_zillow_house_value_index | 2520 | 2000 | 2023 |
| 3 | nj_food_desert | 2002 | NaN | NaN |
| 4 | nj_crime_detail | 9646 | 2017 | 2020 |
| 5 | nj_school_performance | 8888 | 2017 | 2020 |
| 6 | nj_zillow_observed_rent_index | 180 | 2015 | 2023 |
| 7 | nj_adi | 6437 | NaN | NaN |
| 8 | nj_counties_dist_to_major_cities | 21 | NaN | NaN |
| 9 | nj_poverty_median_income | 462 | 2000 | 2021 |
| 10 | nj_property_tax | 12972 | 2000 | 2022 |
Most of the datasets are historical.
# Load the full crime-detail table and inspect its schema / null counts.
crime_query = """SELECT * FROM nj_crime_detail;"""
crime_df = pd.read_sql(crime_query, sqlite_connection)
crime_df.info()
2023-03-27 13:59:15,814 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_crime_detail;")
2023-03-27 13:59:15,816 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,825 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_crime_detail;")
2023-03-27 13:59:15,827 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,831 INFO sqlalchemy.engine.Engine SELECT * FROM nj_crime_detail;
2023-03-27 13:59:15,834 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9646 entries, 0 to 9645
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 county_name 9646 non-null object
1 year 9646 non-null int64
2 agency 9646 non-null object
3 report_type 9646 non-null object
4 population 9521 non-null float64
5 murder 9646 non-null float64
6 rape 9646 non-null float64
7 robbery 9646 non-null float64
8 assault 9646 non-null float64
9 burglary 9646 non-null float64
10 larceny 9646 non-null float64
11 auto_theft 9646 non-null float64
12 total 9646 non-null float64
dtypes: float64(9), int64(1), object(3)
memory usage: 979.8+ KB
# Five-number summary (plus count/mean/std) for every numeric crime column.
crime_df.describe()
| year | population | murder | rape | robbery | assault | burglary | larceny | auto_theft | total | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 9,646.00 | 9,521.00 | 9,646.00 | 9,646.00 | 9,646.00 | 9,646.00 | 9,646.00 | 9,646.00 | 9,646.00 | 9,646.00 |
| mean | 2,018.80 | 15,751.89 | 2.36 | 5.75 | 15.05 | 35.78 | 51.62 | 333.61 | 25.58 | 446.59 |
| std | 0.98 | 24,654.68 | 13.93 | 17.90 | 74.20 | 218.42 | 128.47 | 3,398.15 | 123.21 | 3,594.00 |
| min | 2,017.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 25% | 2,018.00 | 2,934.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 5.00 | 0.00 | 10.00 |
| 50% | 2,019.00 | 8,271.00 | 0.00 | 0.00 | 0.00 | 5.00 | 7.00 | 25.00 | 1.00 | 36.00 |
| 75% | 2,020.00 | 18,874.00 | 0.00 | 1.00 | 8.00 | 40.77 | 38.00 | 198.15 | 17.00 | 292.00 |
| max | 2,020.00 | 283,673.00 | 232.80 | 300.00 | 5,797.10 | 20,000.00 | 2,068.30 | 194,117.60 | 7,352.90 | 202,941.20 |
# Average crime rates per county/year, restricted to the per-100k-rate rows.
# numeric_only=True keeps the text columns (agency, report_type) out of the
# mean -- pandas >= 2.0 raises on object columns otherwise (older pandas
# silently dropped them, so the result is unchanged).
df = (
    crime_df[crime_df['report_type'] == 'Rate Per 100,000']
    .groupby(['county_name', 'year'], as_index=False)
    .mean(numeric_only=True)
)
# crime by county trend
fig = px.line(df, x="year", y='total', title='Crime by County',
              color='county_name', markers=True)
fig.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=[2017, 2018, 2019, 2020],
        ticktext=[2017, 2018, 2019, 2020]
    ),
    yaxis_title="Crime Per 100k population"
)
fig.show()
Crime rate is going down in most of the counties across years, except Cape May.
# crime by type trend: statewide average of each crime category per year.
# numeric_only=True keeps the county_name text column out of the mean
# (pandas >= 2.0 raises on object columns; older pandas dropped it silently).
crime_type = df.groupby('year', as_index=False).mean(numeric_only=True)
# Plot every per-crime column: skip year/population at the front, total at the end.
fig = px.line(crime_type, x="year", y=crime_type.columns[2:-1], title='Crime by types', markers=True)
fig.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=[2017, 2018, 2019, 2020],
        ticktext=[2017, 2018, 2019, 2020]
    ),
    yaxis_title="Crime Per 100k population"
)
fig.show()
The most frequent crime in NJ is larceny (a property crime rather than a violent one), and it is going down over time along with all the other crime categories.
# Load poverty / median-household-income figures and inspect the schema.
pov_query = """SELECT * FROM nj_poverty_median_income;"""
pov_mhi_df = pd.read_sql(pov_query, sqlite_connection)
pov_mhi_df.info()
2023-03-27 13:59:18,989 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_poverty_median_income;")
2023-03-27 13:59:18,990 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:18,992 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_poverty_median_income;")
2023-03-27 13:59:18,994 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:18,996 INFO sqlalchemy.engine.Engine SELECT * FROM nj_poverty_median_income;
2023-03-27 13:59:18,997 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462 entries, 0 to 461
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 county_name 462 non-null object
1 median_hh_income 462 non-null int64
2 poverty_count 462 non-null int64
3 poverty_rate 462 non-null float64
4 st_abb 462 non-null object
5 year 462 non-null int64
6 state_code 462 non-null object
7 county_code 462 non-null object
dtypes: float64(1), int64(3), object(4)
memory usage: 29.0+ KB
# Five-number summary (plus count/mean/std) for the numeric income/poverty columns.
pov_mhi_df.describe()
| median_hh_income | poverty_count | poverty_rate | year | |
|---|---|---|---|---|
| count | 462.00 | 462.00 | 462.00 | 462.00 |
| mean | 69,642.02 | 39,028.81 | 9.24 | 2,010.50 |
| std | 18,676.75 | 30,750.71 | 3.96 | 6.35 |
| min | 36,574.00 | 3,178.00 | 2.50 | 2,000.00 |
| 25% | 54,819.75 | 12,870.75 | 5.90 | 2,005.00 |
| 50% | 66,456.00 | 29,364.50 | 8.50 | 2,010.50 |
| 75% | 80,453.75 | 58,289.75 | 11.90 | 2,016.00 |
| max | 123,708.00 | 136,161.00 | 20.10 | 2,021.00 |
# Poverty-rate trend per county, with one x-axis tick for every year 2000-2021.
fig = px.line(
    pov_mhi_df,
    x="year",
    y="poverty_rate",
    title='Poverty',
    color='county_name',
    markers=True,
)
year_ticks = list(range(2000, 2022))
fig.update_layout(
    xaxis=dict(tickmode='array', tickvals=year_ticks, ticktext=year_ticks)
)
fig.show()
The poverty rate has fluctuated up and down from 2000 through 2021, but ultimately the 2021 poverty rate is higher than it was back in 2000.
# Median household income trend per county, one x-axis tick per year 2000-2021.
fig = px.line(
    pov_mhi_df,
    x="year",
    y="median_hh_income",
    title='Median Income',
    color='county_name',
    markers=True,
)
year_ticks = list(range(2000, 2022))
fig.update_layout(
    xaxis=dict(tickmode='array', tickvals=year_ticks, ticktext=year_ticks)
)
fig.show()
Median income is steadily going up across time.
# Load the school-performance table and inspect the schema / null counts.
school_query = """SELECT * FROM nj_school_performance;"""
school_df = pd.read_sql(school_query, sqlite_connection)
school_df.info()
2023-03-27 13:59:19,643 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_school_performance;")
2023-03-27 13:59:19,645 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:19,648 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_school_performance;")
2023-03-27 13:59:19,652 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:19,656 INFO sqlalchemy.engine.Engine SELECT * FROM nj_school_performance;
2023-03-27 13:59:19,659 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8888 entries, 0 to 8887
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 rank 8888 non-null int64
1 school 8888 non-null object
2 grades 8888 non-null object
3 district 8888 non-null object
4 students 8888 non-null int64
5 free_lunch_rec 8863 non-null float64
6 total_exp 8790 non-null float64
7 score 8888 non-null float64
8 year 8888 non-null int64
9 school_type 8888 non-null object
10 type 8888 non-null object
11 address 8888 non-null object
12 city 8888 non-null object
13 zip 8888 non-null object
14 county_name 8888 non-null object
dtypes: float64(3), int64(3), object(9)
memory usage: 1.0+ MB
# Five-number summary (plus count/mean/std) for the numeric school columns.
school_df.describe()
| rank | students | free_lunch_rec | total_exp | score | year | |
|---|---|---|---|---|---|---|
| count | 8,888.00 | 8,888.00 | 8,863.00 | 8,790.00 | 8,888.00 | 8,888.00 |
| mean | 505.87 | 593.56 | 37.71 | 11,537.00 | 49.73 | 2,018.42 |
| std | 369.59 | 450.28 | 29.97 | 3,494.85 | 27.08 | 1.12 |
| min | 1.00 | 0.00 | 0.00 | 337.00 | 1.20 | 2,017.00 |
| 25% | 198.00 | 333.00 | 10.50 | 9,233.00 | 26.40 | 2,017.00 |
| 50% | 403.50 | 471.00 | 30.90 | 11,210.00 | 49.30 | 2,018.00 |
| 75% | 766.25 | 708.00 | 63.40 | 13,433.00 | 73.90 | 2,019.00 |
| max | 1,336.00 | 6,298.00 | 100.00 | 33,152.00 | 99.20 | 2,020.00 |
# trend - County: mean school score per county per year.
df = school_df.groupby(['year', 'county_name']).agg(MeanScore=('score', 'mean')).reset_index()
df['year'] = pd.to_datetime(df['year'], format='%Y')
# trend
fig = px.line(df, x="year", y="MeanScore", title='School Scores by County',
              color='county_name', markers=True)
# After to_datetime the x axis is a date axis, so tick positions must be
# datetimes too -- bare ints like 2017 would be read as ms since epoch and
# the configured ticks would never show.
years = [2017, 2018, 2019, 2020]
fig.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=pd.to_datetime(years, format='%Y'),
        ticktext=years
    )
)
fig.show()
On average, NJ schools maintained their quality/ranking over time, with a couple of exceptions like Passaic, Sussex and Bergen.
# trend - School types/phases: mean score per school_type per year.
df = school_df.groupby(['year', 'school_type']).agg(MeanScore=('score', 'mean')).reset_index()
df['year'] = pd.to_datetime(df['year'], format='%Y')
# trend
fig = px.line(df, x="year", y="MeanScore", title='School Scores by School Types',
              color='school_type', markers=True)
# Date axis: tick positions must be datetimes, not bare ints (plotly
# interprets integer tickvals on a date axis as milliseconds since epoch).
years = [2017, 2018, 2019, 2020]
fig.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=pd.to_datetime(years, format='%Y'),
        ticktext=years
    )
)
fig.show()
Mean scores for elementary schools went down. Middle schools saw a steep decline from 2018 to 2019 and then recovered in 2020. High school scores rose considerably from 2019 onwards.
# trend - type: mean score per institutional type per year.
df = school_df.groupby(['year', 'type']).agg(MeanScore=('score', 'mean')).reset_index()
df['year'] = pd.to_datetime(df['year'], format='%Y')
# trend
fig = px.line(df, x="year", y="MeanScore", title='School Scores by Type',
              color='type', markers=True)
# Date axis: tick positions must be datetimes, not bare ints (plotly
# interprets integer tickvals on a date axis as milliseconds since epoch).
years = [2017, 2018, 2019, 2020]
fig.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=pd.to_datetime(years, format='%Y'),
        ticktext=years
    )
)
fig.show()
Mean scores for "Public, Alternative" schools went down, and "Public, Charter, Alternative" schools' scores declined from 2018. "Public" schools maintained their scores, while "Public, Charter" schools' scores decreased slightly over time.
# Load the Zillow house-value index table and inspect the schema.
zhvi_query = """SELECT * FROM nj_zillow_house_value_index;"""
zhvi_df = pd.read_sql(zhvi_query, sqlite_connection)
zhvi_df.info()
2023-03-27 13:59:20,520 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_zillow_house_value_index;")
2023-03-27 13:59:20,522 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:20,525 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_zillow_house_value_index;")
2023-03-27 13:59:20,527 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:20,528 INFO sqlalchemy.engine.Engine SELECT * FROM nj_zillow_house_value_index;
2023-03-27 13:59:20,530 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2520 entries, 0 to 2519
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 county_name 2520 non-null object
1 year 2520 non-null int64
2 num_of_bedrooms 2520 non-null int64
3 house_value_index 2500 non-null float64
dtypes: float64(1), int64(2), object(1)
memory usage: 78.9+ KB
# Five-number summary (plus count/mean/std) for the numeric house-value columns.
zhvi_df.describe()
| year | num_of_bedrooms | house_value_index | |
|---|---|---|---|
| count | 2,520.00 | 2,520.00 | 2,500.00 |
| mean | 2,011.50 | 3.00 | 300,644.65 |
| std | 6.92 | 1.41 | 180,913.97 |
| min | 2,000.00 | 1.00 | 24,948.62 |
| 25% | 2,005.75 | 2.00 | 169,532.93 |
| 50% | 2,011.50 | 3.00 | 258,947.71 |
| 75% | 2,017.25 | 4.00 | 388,660.31 |
| max | 2,023.00 | 5.00 | 1,620,305.39 |
# Mean house-value index per year, one line per bedroom count.
df = zhvi_df.groupby(['year', 'num_of_bedrooms'], as_index=False).agg({'house_value_index': 'mean'})
fig = px.line(
    df, x="year", y='house_value_index', color='num_of_bedrooms',
    title='Price by bedrooms in NJ', markers=True,
)
fig.show()
House prices for all bedrooms are going up over time except for a short decline from 2007 to 2012.
# Mean house-value index per year, one line per county.
df = zhvi_df.groupby(['year', 'county_name'], as_index=False).agg({'house_value_index': 'mean'})
fig = px.line(
    df, x="year", y='house_value_index', color='county_name',
    title='Price by county in NJ', markers=True,
)
fig.show()
House prices for all counties are going up over time except for a short decline from 2007 to 2012.
# Load the Zillow observed-rent index table and inspect the schema.
zori_query = """SELECT * FROM nj_zillow_observed_rent_index;"""
zori_df = pd.read_sql(zori_query, sqlite_connection)
zori_df.info()
2023-03-27 13:59:21,099 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_zillow_observed_rent_index;")
2023-03-27 13:59:21,105 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:21,112 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_zillow_observed_rent_index;")
2023-03-27 13:59:21,114 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:21,116 INFO sqlalchemy.engine.Engine SELECT * FROM nj_zillow_observed_rent_index;
2023-03-27 13:59:21,118 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 county_name 180 non-null object
1 year 180 non-null int64
2 observed_rent_index 157 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 4.3+ KB
# Five-number summary (plus count/mean/std) for the numeric rent-index columns.
zori_df.describe()
| year | observed_rent_index | |
|---|---|---|
| count | 180.00 | 157.00 |
| mean | 2,019.00 | 1,876.35 |
| std | 2.59 | 384.72 |
| min | 2,015.00 | 1,185.20 |
| 25% | 2,017.00 | 1,576.68 |
| 50% | 2,019.00 | 1,850.26 |
| 75% | 2,021.00 | 2,147.89 |
| max | 2,023.00 | 2,855.24 |
# Observed rent index over time, one line per county.
fig = px.line(
    zori_df, x="year", y='observed_rent_index', color='county_name',
    title='Rent Price by county in NJ', markers=True,
)
fig.show()
Rental prices for all counties are going up over time.
# Load the property-tax table and inspect the schema.
tax_query = """SELECT * FROM nj_property_tax;"""
tax_df = pd.read_sql(tax_query, sqlite_connection)
tax_df.info()
2023-03-27 13:59:21,361 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_property_tax;")
2023-03-27 13:59:21,364 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:21,367 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_property_tax;")
2023-03-27 13:59:21,369 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:21,374 INFO sqlalchemy.engine.Engine SELECT * FROM nj_property_tax;
2023-03-27 13:59:21,376 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12972 entries, 0 to 12971
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 county_code 12972 non-null object
1 county_name 12972 non-null object
2 district_code 12972 non-null object
3 district_name 12972 non-null object
4 year 12972 non-null int64
5 tax_rate 12972 non-null float64
dtypes: float64(1), int64(1), object(4)
memory usage: 608.2+ KB
# Five-number summary (plus count/mean/std) for the numeric tax-rate columns.
tax_df.describe()
| year | tax_rate | |
|---|---|---|
| count | 12,972.00 | 12,972.00 |
| mean | 2,011.00 | 3.23 |
| std | 6.63 | 2.31 |
| min | 2,000.00 | 0.00 |
| 25% | 2,005.00 | 2.16 |
| 50% | 2,011.00 | 2.82 |
| 75% | 2,017.00 | 3.63 |
| max | 2,022.00 | 37.77 |
# Mean property-tax rate per county per year, plotted over time.
df = tax_df.groupby(['year', 'county_name']).agg(tax_rate=('tax_rate', 'mean')).reset_index()
df['year'] = pd.to_datetime(df['year'], format='%Y')
fig = px.line(
    df, x="year", y="tax_rate", title='Property Tax Rate',
    color='county_name', markers=True,
)
fig.show()
Essex property tax saw a huge decline since 2005 through 2011. Union property tax was on the rise since 2000 till 2015 and then it started to decline. Hudson also saw increase in property tax up until 2016 and then it went down. The rest of the counties kind of maintained same property tax across time.
tax_df[tax_df['year']==2022].groupby(['county_name']).agg({'tax_rate':['min','mean','max']})
| tax_rate | |||
|---|---|---|---|
| min | mean | max | |
| county_name | |||
| ATLANTIC | 1.01 | 3.01 | 5.45 |
| BERGEN | 0.74 | 2.55 | 4.23 |
| BURLINGTON | 1.28 | 3.08 | 4.87 |
| CAMDEN | 1.74 | 4.29 | 7.83 |
| CAPE MAY | 0.57 | 1.43 | 2.83 |
| CUMBERLAND | 2.37 | 3.30 | 5.03 |
| ESSEX | 1.94 | 3.14 | 5.96 |
| GLOUCESTER | 2.09 | 3.66 | 4.89 |
| HUDSON | 1.58 | 4.01 | 10.54 |
| HUNTERDON | 2.09 | 2.94 | 4.25 |
| MERCER | 2.43 | 3.36 | 5.46 |
| MIDDLESEX | 1.78 | 5.07 | 11.46 |
| MONMOUTH | 0.52 | 1.74 | 3.03 |
| MORRIS | 1.13 | 2.56 | 3.27 |
| OCEAN | 0.70 | 1.99 | 3.25 |
| PASSAIC | 2.44 | 4.10 | 5.71 |
| SALEM | 1.62 | 3.73 | 7.52 |
| SOMERSET | 1.33 | 2.37 | 4.08 |
| SUSSEX | 0.72 | 3.27 | 4.56 |
| UNION | 2.00 | 7.94 | 30.58 |
| WARREN | 2.31 | 3.52 | 5.87 |
The table above shows the lowest, average and highest property taxes by NJ counties.
# Load the mortgage-rate table and inspect the schema.
apr_query = """SELECT * FROM nj_mortgage_rates;"""
apr_df = pd.read_sql(apr_query, sqlite_connection)
apr_df.info()
2023-03-27 13:59:21,907 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_mortgage_rates;")
2023-03-27 13:59:21,909 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:21,912 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_mortgage_rates;")
2023-03-27 13:59:21,913 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:21,914 INFO sqlalchemy.engine.Engine SELECT * FROM nj_mortgage_rates;
2023-03-27 13:59:21,916 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 year 32 non-null int64
1 apr_30 32 non-null float64
2 points_30 32 non-null float64
3 apr_15 32 non-null float64
4 points_15 32 non-null float64
dtypes: float64(4), int64(1)
memory usage: 1.4 KB
# Five-number summary (plus count/mean/std) for the APR and points columns.
apr_df.describe()
| year | apr_30 | points_30 | apr_15 | points_15 | |
|---|---|---|---|---|---|
| count | 32.00 | 32.00 | 32.00 | 32.00 | 32.00 |
| mean | 2,006.50 | 5.80 | 0.91 | 5.25 | 0.89 |
| std | 9.38 | 1.75 | 0.47 | 1.86 | 0.46 |
| min | 1,991.00 | 2.96 | 0.43 | 2.27 | 0.45 |
| 25% | 1,998.75 | 4.12 | 0.59 | 3.37 | 0.58 |
| 50% | 2,006.50 | 5.85 | 0.72 | 5.31 | 0.68 |
| 75% | 2,014.25 | 7.35 | 1.00 | 6.88 | 0.99 |
| max | 2,022.00 | 8.79 | 1.83 | 8.40 | 1.77 |
# Trend of every mortgage series (all columns after `year`: APRs and points).
rate_cols = apr_df.columns[1:]
fig = px.line(apr_df, x="year", y=rate_cols, title='Mortgage APR', markers=True)
fig.show()
APR for 15-year and 30-year mortgages was going down steadily since 1990, then started to shoot up from 2021 onwards. Points for both 15- and 30-year mortgages declined a little but mostly stayed the same.
# Load the population-estimate table and inspect the schema.
pop_query = """SELECT * FROM nj_population;"""
pop_df = pd.read_sql(pop_query, sqlite_connection)
pop_df.info()
2023-03-27 13:59:22,096 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_population;")
2023-03-27 13:59:22,098 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:22,103 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_population;")
2023-03-27 13:59:22,106 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:22,108 INFO sqlalchemy.engine.Engine SELECT * FROM nj_population;
2023-03-27 13:59:22,109 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 county_name 252 non-null object
1 est_pop 252 non-null int64
2 year 252 non-null int64
dtypes: int64(2), object(1)
memory usage: 6.0+ KB
# Five-number summary (plus count/mean/std) for population and year.
pop_df.describe()
| est_pop | year | |
|---|---|---|
| count | 252.00 | 252.00 |
| mean | 423,663.90 | 2,015.50 |
| std | 254,065.46 | 3.46 |
| min | 62,341.00 | 2,010.00 |
| 25% | 150,928.75 | 2,012.75 |
| 50% | 448,449.00 | 2,015.50 |
| 75% | 603,111.25 | 2,018.25 |
| max | 953,819.00 | 2,021.00 |
# Estimated population over time, one line per county.
fig = px.line(
    pop_df, x="year", y='est_pop', color='county_name',
    title='Population (Est) in NJ', markers=True,
)
fig.show()
Population counts (estimated) are increasing very slightly over the years for all the counties.
# Load the food-desert table (148 columns); verbose + show_counts forces
# the full per-column non-null summary despite the high column count.
sql_query = """SELECT * FROM nj_food_desert;"""
food_df = pd.read_sql(sql_query, sqlite_connection)
# `null_counts` was deprecated in pandas 1.2 and removed in 2.0;
# `show_counts` is its direct replacement with identical behavior.
food_df.info(verbose=True, show_counts=True)
2023-03-27 13:59:22,360 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_food_desert;")
2023-03-27 13:59:22,361 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:22,363 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_food_desert;")
2023-03-27 13:59:22,365 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:22,369 INFO sqlalchemy.engine.Engine SELECT * FROM nj_food_desert;
2023-03-27 13:59:22,370 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2002 entries, 0 to 2001
Data columns (total 148 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 censustract 2002 non-null int64
1 state 2002 non-null object
2 county 2002 non-null object
3 urban 2002 non-null int64
4 pop2010 2002 non-null int64
5 ohu2010 2002 non-null int64
6 groupquartersflag 2002 non-null int64
7 numgqtrs 2002 non-null float64
8 pctgqtrs 2002 non-null float64
9 lilatracts_1and10 2002 non-null int64
10 lilatracts_halfand10 2002 non-null int64
11 lilatracts_1and20 2002 non-null int64
12 lilatracts_vehicle 2002 non-null int64
13 hunvflag 2002 non-null int64
14 lowincometracts 2002 non-null int64
15 povertyrate 2002 non-null float64
16 medianfamilyincome 1988 non-null float64
17 la1and10 2002 non-null int64
18 lahalfand10 2002 non-null int64
19 la1and20 2002 non-null int64
20 latracts_half 2002 non-null int64
21 latracts1 2002 non-null int64
22 latracts10 2002 non-null int64
23 latracts20 2002 non-null int64
24 latractsvehicle_20 2002 non-null int64
25 lapop1_10 1079 non-null float64
26 lapop05_10 1603 non-null float64
27 lapop1_20 1068 non-null float64
28 lalowi1_10 1079 non-null float64
29 lalowi05_10 1603 non-null float64
30 lalowi1_20 1068 non-null float64
31 lapophalf 1742 non-null float64
32 lapophalfshare 1742 non-null float64
33 lalowihalf 1742 non-null float64
34 lalowihalfshare 1742 non-null float64
35 lakidshalf 1742 non-null float64
36 lakidshalfshare 1742 non-null float64
37 laseniorshalf 1742 non-null float64
38 laseniorshalfshare 1742 non-null float64
39 lawhitehalf 1742 non-null float64
40 lawhitehalfshare 1742 non-null float64
41 lablackhalf 1742 non-null float64
42 lablackhalfshare 1742 non-null float64
43 laasianhalf 1742 non-null float64
44 laasianhalfshare 1742 non-null float64
45 lanhopihalf 1742 non-null float64
46 lanhopihalfshare 1742 non-null float64
47 laaianhalf 1742 non-null float64
48 laaianhalfshare 1742 non-null float64
49 laomultirhalf 1742 non-null float64
50 laomultirhalfshare 1742 non-null float64
51 lahisphalf 1742 non-null float64
52 lahisphalfshare 1742 non-null float64
53 lahunvhalf 1742 non-null float64
54 lahunvhalfshare 1743 non-null float64
55 lasnaphalf 1742 non-null float64
56 lasnaphalfshare 1743 non-null float64
57 lapop1 1218 non-null float64
58 lapop1share 1218 non-null float64
59 lalowi1 1218 non-null float64
60 lalowi1share 1218 non-null float64
61 lakids1 1218 non-null float64
62 lakids1share 1218 non-null float64
63 laseniors1 1218 non-null float64
64 laseniors1share 1218 non-null float64
65 lawhite1 1218 non-null float64
66 lawhite1share 1218 non-null float64
67 lablack1 1218 non-null float64
68 lablack1share 1218 non-null float64
69 laasian1 1218 non-null float64
70 laasian1share 1218 non-null float64
71 lanhopi1 1218 non-null float64
72 lanhopi1share 1218 non-null float64
73 laaian1 1218 non-null float64
74 laaian1share 1218 non-null float64
75 laomultir1 1218 non-null float64
76 laomultir1share 1218 non-null float64
77 lahisp1 1218 non-null float64
78 lahisp1share 1218 non-null float64
79 lahunv1 1218 non-null float64
80 lahunv1share 1220 non-null float64
81 lasnap1 1218 non-null float64
82 lasnap1share 1220 non-null float64
83 lapop10 11 non-null float64
84 lapop10share 11 non-null float64
85 lalowi10 11 non-null float64
86 lalowi10share 11 non-null float64
87 lakids10 11 non-null float64
88 lakids10share 11 non-null float64
89 laseniors10 11 non-null float64
90 laseniors10share 11 non-null float64
91 lawhite10 11 non-null float64
92 lawhite10share 11 non-null float64
93 lablack10 11 non-null float64
94 lablack10share 11 non-null float64
95 laasian10 11 non-null float64
96 laasian10share 11 non-null float64
97 lanhopi10 11 non-null float64
98 lanhopi10share 11 non-null float64
99 laaian10 11 non-null float64
100 laaian10share 11 non-null float64
101 laomultir10 11 non-null float64
102 laomultir10share 11 non-null float64
103 lahisp10 11 non-null float64
104 lahisp10share 11 non-null float64
105 lahunv10 11 non-null float64
106 lahunv10share 13 non-null float64
107 lasnap10 11 non-null float64
108 lasnap10share 13 non-null float64
109 lapop20 0 non-null object
110 lapop20share 0 non-null object
111 lalowi20 0 non-null object
112 lalowi20share 0 non-null object
113 lakids20 0 non-null object
114 lakids20share 0 non-null object
115 laseniors20 0 non-null object
116 laseniors20share 0 non-null object
117 lawhite20 0 non-null object
118 lawhite20share 0 non-null object
119 lablack20 0 non-null object
120 lablack20share 0 non-null object
121 laasian20 0 non-null object
122 laasian20share 0 non-null object
123 lanhopi20 0 non-null object
124 lanhopi20share 0 non-null object
125 laaian20 0 non-null object
126 laaian20share 0 non-null object
127 laomultir20 0 non-null object
128 laomultir20share 0 non-null object
129 lahisp20 0 non-null object
130 lahisp20share 0 non-null object
131 lahunv20 0 non-null object
132 lahunv20share 3 non-null float64
133 lasnap20 0 non-null object
134 lasnap20share 3 non-null float64
135 tractlowi 2002 non-null float64
136 tractkids 2002 non-null float64
137 tractseniors 2002 non-null float64
138 tractwhite 2002 non-null float64
139 tractblack 2002 non-null float64
140 tractasian 2002 non-null float64
141 tractnhopi 2002 non-null float64
142 tractaian 2002 non-null float64
143 tractomultir 2002 non-null float64
144 tracthispanic 2002 non-null float64
145 tracthunv 2002 non-null float64
146 tractsnap 2002 non-null float64
147 county_name 2002 non-null object
dtypes: float64(102), int64(19), object(27)
memory usage: 2.3+ MB
<ipython-input-31-02ebe01409f7>:3: FutureWarning: null_counts is deprecated. Use show_counts instead
# five number summary
# Numeric columns only (121 of the 148 columns); the object columns
# (state, county, the all-null la*20 columns, county_name) are excluded
# from describe() by default.
food_df.describe()
| censustract | urban | pop2010 | ohu2010 | groupquartersflag | numgqtrs | pctgqtrs | lilatracts_1and10 | lilatracts_halfand10 | lilatracts_1and20 | ... | tractseniors | tractwhite | tractblack | tractasian | tractnhopi | tractaian | tractomultir | tracthispanic | tracthunv | tractsnap | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2,002.00 | 2,002.00 | 2,002.00 | 2,002.00 | 2,002.00 | 2,002.00 | 2,002.00 | 2,002.00 | 2,002.00 | 2,002.00 | ... | 2,002.00 | 2,002.00 | 2,002.00 | 2,002.00 | 2,002.00 | 2,002.00 | 2,002.00 | 2,002.00 | 2,002.00 | 2,002.00 |
| mean | 34,019,297,884.01 | 0.93 | 4,391.56 | 1,605.57 | 0.01 | 93.34 | 2.19 | 0.05 | 0.17 | 0.05 | ... | 592.40 | 3,011.61 | 601.81 | 362.50 | 1.52 | 14.50 | 399.61 | 776.80 | 183.70 | 144.89 |
| std | 11,281,186.46 | 0.26 | 1,779.84 | 651.96 | 0.07 | 378.36 | 8.72 | 0.23 | 0.38 | 0.22 | ... | 393.53 | 1,732.43 | 843.23 | 576.24 | 2.70 | 19.06 | 471.97 | 975.97 | 211.90 | 158.42 |
| min | 34,001,000,100.00 | 0.00 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 25% | 34,009,020,725.00 | 1.00 | 3,125.50 | 1,138.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 343.00 | 1,764.75 | 81.00 | 50.00 | 0.00 | 4.00 | 114.00 | 202.25 | 43.00 | 33.00 |
| 50% | 34,019,010,801.50 | 1.00 | 4,194.50 | 1,546.00 | 0.00 | 5.00 | 0.11 | 0.00 | 0.00 | 0.00 | ... | 513.50 | 2,887.00 | 236.00 | 151.00 | 0.00 | 8.00 | 223.00 | 388.00 | 108.00 | 88.00 |
| 75% | 34,027,045,901.75 | 1.00 | 5,536.75 | 2,006.00 | 0.00 | 38.00 | 0.92 | 0.00 | 0.00 | 0.00 | ... | 744.00 | 4,085.25 | 739.00 | 412.00 | 2.00 | 18.00 | 463.75 | 894.25 | 248.00 | 201.00 |
| max | 34,041,032,400.00 | 1.00 | 15,765.00 | 5,270.00 | 1.00 | 6,451.00 | 100.00 | 1.00 | 1.00 | 1.00 | ... | 3,482.00 | 11,217.00 | 5,126.00 | 6,680.00 | 32.00 | 306.00 | 3,921.00 | 6,659.00 | 2,520.00 | 1,043.00 |
8 rows × 121 columns
# Grouped bars: total food-desert tract counts per county at three
# low-access thresholds (la1and10 / lahalfand10 / la1and20 — presumably
# urban/rural distance cutoffs per the USDA naming; confirm with the source).
tract_sums = {'la1and10': 'sum', 'lahalfand10': 'sum', 'la1and20': 'sum'}
df = food_df.groupby(['county_name'], as_index=False).agg(tract_sums)
fig = px.bar(
    df,
    x="county_name",
    y=df.columns[1:],
    barmode='group',
    title='Food desert tracts by county in NJ',
)
fig.show()
Food desert counts are mostly the same for the 1-mile-urban / 10-mile-rural radius and the 1-mile-urban / 20-mile-rural radius measures. Counts are high for the half-mile-urban / 10-mile-rural radius measure in Middlesex, Bergen, and Monmouth.
# Load the Area Deprivation Index (ADI) table and inspect its schema.
adi_query = "SELECT * FROM nj_adi;"
adi_df = pd.read_sql(adi_query, sqlite_connection)
adi_df.info()
2023-03-27 13:59:22,814 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_adi;")
2023-03-27 13:59:22,817 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:22,821 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_adi;")
2023-03-27 13:59:22,823 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:22,825 INFO sqlalchemy.engine.Engine SELECT * FROM nj_adi;
2023-03-27 13:59:22,827 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6437 entries, 0 to 6436
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 GISJOIN 6437 non-null object
1 ADI_NATRANK 6437 non-null float64
2 ADI_STATERNK 6437 non-null float64
3 FIPS 6437 non-null object
4 county_code 6437 non-null object
5 county_name 6437 non-null object
dtypes: float64(2), object(4)
memory usage: 301.9+ KB
# five number summary
# Only the two float columns (ADI_NATRANK, ADI_STATERNK) are summarized;
# GISJOIN/FIPS/county codes are object dtype and excluded by default.
adi_df.describe()
| ADI_NATRANK | ADI_STATERNK | |
|---|---|---|
| count | 6,437.00 | 6,437.00 |
| mean | 31.45 | 5.50 |
| std | 20.65 | 2.87 |
| min | 1.00 | 1.00 |
| 25% | 16.00 | 3.00 |
| 50% | 28.00 | 5.00 |
| 75% | 43.00 | 8.00 |
| max | 100.00 | 10.00 |
# Grouped bars: mean national and state ADI rank per county.
rank_means = {'ADI_NATRANK': 'mean', 'ADI_STATERNK': 'mean'}
df = adi_df.groupby(['county_name'], as_index=False).agg(rank_means)
fig = px.bar(
    df,
    x="county_name",
    y=df.columns[1:],
    barmode='group',
    title='ADI Ranking by county in NJ',
)
fig.show()
Counties like Cumberland, Salem, Camden, and Atlantic are ranked pretty far down the list of counties nationwide in terms of the Area Deprivation Index, which translates to low socio-economic and educational opportunities.
# Load the county-to-major-city distance table and inspect its schema.
dist_query = "SELECT * FROM nj_counties_dist_to_major_cities;"
distance_df = pd.read_sql(dist_query, sqlite_connection)
distance_df.info()
2023-03-27 13:59:23,015 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_counties_dist_to_major_cities;")
2023-03-27 13:59:23,018 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:23,021 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_counties_dist_to_major_cities;")
2023-03-27 13:59:23,024 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:23,026 INFO sqlalchemy.engine.Engine SELECT * FROM nj_counties_dist_to_major_cities;
2023-03-27 13:59:23,028 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 county_name 21 non-null object
1 county_code 21 non-null object
2 dist_to_manhattan 21 non-null float64
3 dist_to_queens 21 non-null float64
4 dist_to_brooklyn 21 non-null float64
5 dist_to_bronx 21 non-null float64
6 dist_to_jersery_city 21 non-null float64
7 dist_to_newark 21 non-null float64
8 dist_to_trenton 21 non-null float64
9 dist_to_philadelphia 21 non-null float64
dtypes: float64(8), object(2)
memory usage: 1.8+ KB
# One bar chart per distance column; the chart title is derived from the
# column name by stripping the 'dist_to_' prefix and title-casing the city.
for col in distance_df.select_dtypes('float').columns:
    city = col.replace('dist_to_', '').title()
    fig = px.bar(
        distance_df,
        x="county_name",
        y=col,
        barmode='group',
        title=f"Distance from {city} by county in NJ",
    )
    fig.show()
The charts above show the distance from each major city by county.
# close connection
# Release the SQLAlchemy connection now that all tables have been read
# into dataframes; everything below works on the in-memory frames.
sqlite_connection.close()
# Aggregation at the county level
# Crime: keep only the per-100k rate rows, drop the raw population column,
# then average the rate columns per county/year.
# FIX: numeric_only=True makes the exclusion of the remaining object column
# (report_type) explicit — the old silent-drop default was deprecated and
# removed in pandas 2.0, where the original call would raise.
crime_df_agg = (
    crime_df[crime_df['report_type'] == 'Rate Per 100,000']
    .drop('population', axis=1)
    .groupby(['county_name', 'year'], as_index=False)
    .mean(numeric_only=True)
)
# School scores and property-tax rates: mean per (year, county).
school_df_agg = school_df.groupby(['year', 'county_name']).agg(MeanScore=('score', 'mean')).reset_index()
tax_df_agg = tax_df.groupby(['year', 'county_name']).agg(tax_rate=('tax_rate', 'mean')).reset_index()
# Food desert: total low-access tract counts per county (no year dimension).
food_df_agg = food_df.groupby(['county_name'], as_index=False).agg(
    {'la1and10': 'sum', 'lahalfand10': 'sum', 'la1and20': 'sum'})
# ADI: mean national/state rank per county.
adi_df_agg = adi_df.groupby(['county_name'], as_index=False).agg(
    {'ADI_NATRANK': 'mean', 'ADI_STATERNK': 'mean'})
# Build the combined historical frame: start from the crime aggregates and
# inner-join every yearly county-level table on (county_name, year).
yearly_frames = [school_df_agg, tax_df_agg, zhvi_df, zori_df, pop_df, pov_mhi_df]
all_df = crime_df_agg
for frame in yearly_frames:
    all_df = pd.merge(all_df, frame, how='inner', on=['county_name', 'year'])
all_df.head()
| county_name | year | murder | rape | robbery | assault | burglary | larceny | auto_theft | total | ... | num_of_bedrooms | house_value_index | observed_rent_index | est_pop | median_hh_income | poverty_count | poverty_rate | st_abb | state_code | county_code | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ATLANTIC | 2017 | 6.50 | 13.73 | 78.73 | 118.58 | 427.38 | 1,695.56 | 81.70 | 2,422.18 | ... | 1 | 81,887.96 | 1,303.83 | 265446 | 59309 | 38092 | 14.40 | NJ | 034 | 001 |
| 1 | ATLANTIC | 2017 | 6.50 | 13.73 | 78.73 | 118.58 | 427.38 | 1,695.56 | 81.70 | 2,422.18 | ... | 2 | 125,858.11 | 1,303.83 | 265446 | 59309 | 38092 | 14.40 | NJ | 034 | 001 |
| 2 | ATLANTIC | 2017 | 6.50 | 13.73 | 78.73 | 118.58 | 427.38 | 1,695.56 | 81.70 | 2,422.18 | ... | 3 | 175,185.58 | 1,303.83 | 265446 | 59309 | 38092 | 14.40 | NJ | 034 | 001 |
| 3 | ATLANTIC | 2017 | 6.50 | 13.73 | 78.73 | 118.58 | 427.38 | 1,695.56 | 81.70 | 2,422.18 | ... | 4 | 245,768.02 | 1,303.83 | 265446 | 59309 | 38092 | 14.40 | NJ | 034 | 001 |
| 4 | ATLANTIC | 2017 | 6.50 | 13.73 | 78.73 | 118.58 | 427.38 | 1,695.56 | 81.70 | 2,422.18 | ... | 5 | 398,474.97 | 1,303.83 | 265446 | 59309 | 38092 | 14.40 | NJ | 034 | 001 |
5 rows × 22 columns
# Correlation heatmap over the numeric columns of the historical frame.
# FIX: select numeric dtypes explicitly — all_df carries object columns
# (county_name, st_abb, state_code, county_code), and corr()'s silent
# dropping of non-numeric columns was removed in pandas 2.0.  This matches
# the legacy behavior on every pandas version.
corr = all_df.select_dtypes('number').corr()
fig = px.imshow(corr, aspect="auto", height=1000,
                title='Correlation heatmap for Historical data by Plotly')
fig.show()
House value index has a strong positive correlation with the number of bedrooms and weak positive correlations with school score, rent index, population, and median household income. It has a weak negative correlation with poverty rate but a weak positive correlation with poverty count.
# Joining all 2020 rows: build a single county-level frame for 2020 by
# filtering each table to year == 2020 and inner-merging on county_name.
# Crime rates + poverty/median income (duplicate code columns dropped first;
# 'year' dropped after the merge since the frame is now single-year).
all_2020_df=pd.merge(crime_df_agg[crime_df_agg['year']==2020], \
    pov_mhi_df[pov_mhi_df['year']==2020].drop(['st_abb', 'state_code', 'county_code'],axis=1),\
    how='inner', on = ['county_name', 'year']).drop('year',axis=1)
# Mean school score for 2020.
all_2020_df=pd.merge(all_2020_df, school_df_agg[school_df_agg['year']==2020].drop('year',axis=1), how='inner',\
    on = 'county_name')
# Zillow house value index for 2020, averaged across bedroom counts so the
# merge does not fan out one row per bedroom size.
all_2020_df=pd.merge(all_2020_df, zhvi_df[zhvi_df['year']==2020].drop('year',\
    axis=1).groupby('county_name',\
    as_index=False).agg({'house_value_index':'mean'}),\
    how='inner', on = 'county_name')
# Zillow observed rent index for 2020.
all_2020_df=pd.merge(all_2020_df, zori_df[zori_df['year']==2020].drop('year',axis=1),\
    how='inner', on='county_name')
# Mean property-tax rate for 2020.
all_2020_df=pd.merge(all_2020_df,tax_df_agg[tax_df_agg['year']==2020].drop('year',axis=1),\
    how='inner', on='county_name')
# Estimated population for 2020.
all_2020_df=pd.merge(all_2020_df,pop_df[pop_df['year']==2020].drop('year',axis=1),\
    how='inner', on='county_name')
# Year-independent tables: food desert counts, ADI ranks, city distances.
all_2020_df=pd.merge(all_2020_df,food_df_agg, how='inner', on='county_name')
all_2020_df=pd.merge(all_2020_df,adi_df_agg, how='inner', on='county_name')
all_2020_df=pd.merge(all_2020_df,distance_df, how='inner', on='county_name')
all_2020_df
| county_name | murder | rape | robbery | assault | burglary | larceny | auto_theft | total | median_hh_income | ... | ADI_STATERNK | county_code | dist_to_manhattan | dist_to_queens | dist_to_brooklyn | dist_to_bronx | dist_to_jersery_city | dist_to_newark | dist_to_trenton | dist_to_philadelphia | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ATLANTIC | 3.00 | 12.68 | 41.55 | 95.54 | 274.96 | 1,287.86 | 81.83 | 1,797.43 | 61377 | ... | 8.03 | 001 | 51.68 | 62.88 | 52.48 | 61.56 | 49.36 | 41.69 | 14.77 | 35.00 |
| 1 | BERGEN | 0.41 | 4.33 | 10.48 | 29.65 | 70.74 | 2,243.08 | 94.14 | 2,452.84 | 107114 | ... | 3.42 | 003 | 6.66 | 17.59 | 7.74 | 14.58 | 5.08 | 8.16 | 49.47 | 77.68 |
| 2 | BURLINGTON | 2.56 | 17.19 | 18.47 | 98.75 | 117.88 | 720.39 | 56.77 | 1,032.01 | 92471 | ... | 6.66 | 005 | 48.51 | 60.21 | 49.41 | 58.52 | 46.04 | 37.76 | 8.89 | 34.31 |
| 3 | CAMDEN | 1.71 | 8.70 | 50.15 | 142.79 | 230.76 | 1,551.97 | 96.20 | 2,082.29 | 71718 | ... | 8.26 | 007 | 68.29 | 80.19 | 69.24 | 78.31 | 65.76 | 57.13 | 15.81 | 14.27 |
| 4 | CUMBERLAND | 3.85 | 18.35 | 87.75 | 231.95 | 286.87 | 1,164.85 | 78.72 | 1,872.30 | 60352 | ... | 9.34 | 011 | 80.51 | 92.24 | 81.43 | 90.53 | 78.03 | 69.57 | 28.73 | 10.91 |
| 5 | ESSEX | 2.11 | 8.33 | 31.91 | 53.10 | 96.19 | 564.39 | 141.56 | 897.59 | 65528 | ... | 5.76 | 013 | 16.81 | 29.00 | 17.93 | 26.46 | 14.19 | 5.20 | 37.03 | 65.25 |
| 6 | GLOUCESTER | 2.30 | 2.87 | 21.88 | 68.83 | 152.47 | 825.12 | 57.14 | 1,130.61 | 87220 | ... | 7.39 | 015 | 80.88 | 92.83 | 81.85 | 90.90 | 78.33 | 69.63 | 28.00 | 4.41 |
| 7 | HUDSON | 0.56 | 9.37 | 36.61 | 81.19 | 83.30 | 696.22 | 71.81 | 979.08 | 77323 | ... | 4.76 | 017 | 4.75 | 16.94 | 5.88 | 14.56 | 2.14 | 6.94 | 48.51 | 76.84 |
| 8 | HUNTERDON | 1.24 | 0.78 | 0.20 | 11.95 | 58.86 | 294.44 | 22.16 | 389.63 | 113611 | ... | 4.00 | 019 | 62.91 | 75.10 | 63.97 | 72.72 | 60.29 | 51.27 | 12.35 | 20.54 |
| 9 | MERCER | 2.20 | 7.93 | 19.97 | 56.78 | 61.80 | 329.56 | 35.63 | 513.86 | 87581 | ... | 6.46 | 021 | 48.91 | 60.99 | 49.92 | 58.89 | 46.33 | 37.52 | 4.19 | 32.50 |
| 10 | MIDDLESEX | 1.02 | 6.07 | 14.75 | 62.93 | 80.65 | 570.99 | 49.09 | 785.51 | 95610 | ... | 5.29 | 023 | 28.57 | 40.64 | 29.57 | 38.57 | 26.00 | 17.32 | 24.50 | 52.84 |
| 11 | MONMOUTH | 0.12 | 11.56 | 17.81 | 67.53 | 133.32 | 993.27 | 109.26 | 1,332.86 | 104219 | ... | 3.97 | 025 | 17.28 | 28.53 | 18.04 | 27.10 | 15.10 | 9.51 | 37.28 | 65.45 |
| 12 | MORRIS | 0.44 | 4.73 | 6.26 | 29.35 | 80.26 | 484.50 | 57.23 | 662.77 | 114103 | ... | 3.43 | 027 | 37.43 | 49.62 | 38.55 | 47.01 | 34.81 | 25.78 | 19.04 | 45.92 |
| 13 | OCEAN | 0.44 | 8.53 | 19.43 | 84.27 | 129.56 | 1,010.39 | 97.07 | 1,349.69 | 78181 | ... | 6.08 | 029 | 25.30 | 35.83 | 25.93 | 34.81 | 23.28 | 17.69 | 32.58 | 60.01 |
| 14 | PASSAIC | 1.20 | 5.97 | 31.24 | 80.28 | 81.72 | 599.26 | 67.47 | 867.17 | 64422 | ... | 6.16 | 031 | 21.34 | 33.23 | 22.50 | 30.35 | 18.82 | 10.48 | 35.20 | 62.91 |
| 15 | SALEM | 30.65 | 34.56 | 80.32 | 180.04 | 315.64 | 799.85 | 111.89 | 1,552.90 | 65563 | ... | 8.71 | 033 | 96.06 | 108.03 | 97.04 | 106.07 | 93.51 | 84.76 | 43.08 | 15.12 |
| 16 | SOMERSET | 0.30 | 4.56 | 9.13 | 12.22 | 84.49 | 630.63 | 62.93 | 804.28 | 115573 | ... | 4.07 | 035 | 42.42 | 54.60 | 43.48 | 52.28 | 39.80 | 30.82 | 11.78 | 39.44 |
| 17 | SUSSEX | 1.41 | 8.24 | 0.91 | 28.61 | 47.80 | 262.64 | 17.12 | 366.71 | 92739 | ... | 6.60 | 037 | 48.15 | 60.23 | 49.29 | 57.42 | 45.55 | 36.61 | 17.51 | 39.09 |
| 18 | UNION | 1.79 | 9.50 | 27.22 | 56.83 | 81.37 | 604.69 | 99.97 | 881.40 | 83189 | ... | 5.48 | 039 | 21.07 | 33.27 | 22.14 | 30.94 | 18.46 | 9.48 | 32.25 | 60.53 |
| 19 | WARREN | 0.00 | 7.19 | 7.71 | 32.80 | 160.09 | 894.20 | 32.66 | 1,134.63 | 80412 | ... | 7.27 | 041 | 68.78 | 80.98 | 69.88 | 78.41 | 66.16 | 57.11 | 20.14 | 19.81 |
20 rows × 31 columns
# Correlation heatmap over the numeric columns of the 2020 frame.
# FIX: select numeric dtypes explicitly — all_2020_df carries object columns
# (county_name, county_code), and corr()'s silent dropping of non-numeric
# columns was removed in pandas 2.0.  Matches the legacy behavior everywhere.
corr = all_2020_df.select_dtypes('number').corr()
fig = px.imshow(corr, aspect="auto", height=1000,
                title='Correlation heatmap for 2020 data by Plotly')
fig.show()
House Price Index has a strong negative correlation with distance to all major cities except Trenton and Philadelphia, as well as with the Area Deprivation Index (ADI) — both national and state rankings — and burglary. House Price Index has a strong positive correlation with the Observed Rent Index and moderately strong positive correlations with population, median household income, and school scores. House Price Index also has a weak-to-moderate positive correlation with the food desert count at the half-mile radius.
Poverty rate has a strong negative correlation with school score and median household income, and a moderately strong positive correlation with different violent crimes. We will use poverty rate as a social-determinant factor for our predictions.
# Single source of truth mapping each report name to its dataframe; the
# parallel df_list / df_names variables are derived from it (same order).
frames = {
    "crime_df": crime_df, "pov_mhi_df": pov_mhi_df, "school_df": school_df,
    "zhvi_df": zhvi_df, "zori_df": zori_df, "tax_df": tax_df,
    "apr_df": apr_df, "pop_df": pop_df, "food_df": food_df,
    "adi_df": adi_df, "distance_df": distance_df,
}
df_list = list(frames.values())
df_names = list(frames.keys())
# generate profile report
# One pandas-profiling HTML report per dataframe, written to ../Documents.
for frame, name in zip(df_list, df_names):
    label = name.replace('_df', '').title()
    profile = ProfileReport(frame, title=f"{label} Report",
                            vars={"num": {"low_categorical_threshold": 0}})
    profile.to_file(f"../Documents/profiling_report_{label}.html")
    print(f"Report generated for {label}!")
Report generated for Crime!
Report generated for Pov_Mhi!
Report generated for School!
Report generated for Zhvi!
Report generated for Zori!
Report generated for Tax!
Report generated for Apr!
Report generated for Pop!
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\missingno\missingno.py:250: UserWarning:
FixedFormatter should only be used together with FixedLocator
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\missing.py:89: UserWarning:
There was an attempt to generate the Count missing values diagrams, but this failed.
To hide this warning, disable the calculation
(using `df.profile_report(missing_diagrams={"Count": False}`)
If this is problematic for your use case, please report this as an issue:
https://github.com/pandas-profiling/pandas-profiling/issues
(include the error message: 'The number of FixedLocator locations (7), usually from a call to set_ticks, does not match the number of ticklabels (148).')
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
Report generated for Food!
Report generated for Adi!
Report generated for Distance!